# SEE modeldata package for new datasets
library(tidyverse) # for graphing and data cleaning
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## Warning: package 'tibble' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels) # for modeling
## Warning: package 'tidymodels' was built under R version 4.0.4
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.3 v recipes 0.1.15
## v dials 0.0.9 v rsample 0.0.9
## v infer 0.5.4 v tune 0.1.3
## v modeldata 0.1.0 v workflows 0.2.2
## v parsnip 0.1.5 v yardstick 0.0.7
## Warning: package 'dials' was built under R version 4.0.4
## Warning: package 'infer' was built under R version 4.0.4
## Warning: package 'modeldata' was built under R version 4.0.4
## Warning: package 'parsnip' was built under R version 4.0.4
## Warning: package 'recipes' was built under R version 4.0.4
## Warning: package 'rsample' was built under R version 4.0.4
## Warning: package 'tune' was built under R version 4.0.4
## Warning: package 'workflows' was built under R version 4.0.4
## Warning: package 'yardstick' was built under R version 4.0.4
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
library(stacks) # for stacking models
## Warning: package 'stacks' was built under R version 4.0.4
library(naniar) # for examining missing values (NAs)
## Warning: package 'naniar' was built under R version 4.0.4
library(lubridate) # for date manipulation
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(moderndive) # for King County housing data
## Warning: package 'moderndive' was built under R version 4.0.4
library(vip) # for variable importance plots
## Warning: package 'vip' was built under R version 4.0.4
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(DALEX) # for model interpretation
## Warning: package 'DALEX' was built under R version 4.0.4
## Welcome to DALEX (version: 2.2.0).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## Additional features will be available after installation of: ggpubr.
## Use 'install_dependencies()' to get all suggested dependencies
##
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
##
## explain
library(DALEXtra) # for extension of DALEX
## Warning: package 'DALEXtra' was built under R version 4.0.4
library(patchwork) # for combining plots nicely
theme_set(theme_minimal()) # Lisa's favorite theme
library(ranger)
## Warning: package 'ranger' was built under R version 4.0.4
library(kknn)
## Warning: package 'kknn' was built under R version 4.0.4
# Load the lending_club dataset (shipped with the modeldata package, which is
# attached as part of tidymodels) into the workspace.
data("lending_club")
# Data dictionary (as close as I could find): https://www.kaggle.com/wordsforthewise/lending-club/discussion/170691
When you finish the assignment, remove the # from the options chunk at the top, so that messages and warnings aren’t printed. If you are getting errors in your code, add error = TRUE so that the file knits. I would recommend not removing the # until you are completely finished.
From now on, GitHub should be part of your routine when doing assignments. I recommend making it part of your process anytime you are working in R, but I’ll make you show it’s part of your process for assignments.
Task: When you are finished with the assignment, post a link below to the GitHub repo for the assignment. If you want to post it to your personal website, that’s ok (not required). Make sure the link goes to a spot in the repo where I can easily find this assignment. For example, if you have a website with a blog and post the assignment as a blog post, link to the post’s folder in the repo. As an example, I’ve linked to my GitHub stacking material here.
Before jumping into these problems, you should read through (and follow along with!) the model stacking and global model interpretation tutorials on the Course Materials tab of the course website.
We’ll be using the lending_club dataset from the modeldata library, which is part of tidymodels. The data dictionary they reference doesn’t seem to exist anymore, but it seems the one on this kaggle discussion is pretty close. It might also help to read a bit about Lending Club before starting in on the exercises.
The outcome we are interested in predicting is Class. According to the dataset’s help page, its values are “either ‘good’ (meaning that the loan was fully paid back or currently on-time) or ‘bad’ (charged off, defaulted, [or] 21-120 days late)”.
Tasks: I will be expanding these, but this gives a good outline.
# Print the raw data for a first look.
lending_club

# Distribution of every numeric column: reshape to long format so each
# variable gets its own histogram facet with independent axis scales.
lending_club %>%
  select(where(is.numeric)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(variable), scales = "free")
# Level counts of every factor column: reshape to long format and draw one
# bar chart per variable, laid out in two rows with independent axis scales.
lending_club %>%
  select(where(is.factor)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = value)) +
  geom_bar() +
  facet_wrap(vars(variable), scales = "free", nrow = 2)
# Clean the data: drop rows with missing values and remove the zero or
# near-zero variance columns.
lending_club2 <- lending_club %>%
  na.omit() %>% # not sure if this is right, since it makes no difference
  select(-c(delinq_amnt, acc_now_delinq))

# Print the data before and after cleaning for comparison.
lending_club
lending_club2

# Could not identify any duplicate response variables.
Be sure to add more “bad” Classes. This is not the best solution, but it will work for now. (Should investigate how to appropriately use the step_upsample() function from themis.)
# Upsample the minority class: draw 3000 "bad" loans with replacement and
# append them to the cleaned data.
# The seed is set here because the resample is random; without it the
# augmented dataset (and everything fit on it) changes on every run.
# NOTE(review): these duplicates are added before the train/test split, so
# copies of the same loan can land in both sets and inflate test metrics.
set.seed(494)
create_more_bad <- lending_club2 %>%
  filter(Class == "bad") %>%
  slice_sample(n = 3000, replace = TRUE)

lending_club_mod <- lending_club2 %>%
  bind_rows(create_more_bad)
# lending_club_mod <- lending_club_mod %>%
# mutate(Class_good = (Class == 'good')) %>%
# select(-Class) # not needed to do this?
# Split the augmented data 75% training / 25% testing.
# Stratify on the outcome so both sets keep the same good/bad proportions.
set.seed(494) # for reproducibility
lending_split <- initial_split(lending_club_mod, prop = 0.75, strata = Class)
lending_training <- training(lending_split)
lending_testing <- testing(lending_split)
Make all integer variables numeric (use step_mutate_at(), or this will be a lot of code). We’ll want to do this for the model interpretation we’ll do later.
lending_club_mod
# Preprocessing recipe: model Class from every other column of the training
# data.
lending_recipe <-
  recipe(Class ~ ., data = lending_training) %>%
  # convert every numeric column (integers included) with as.numeric()
  step_mutate_at(all_numeric(), fn = ~ as.numeric(.)) %>%
  # turn nominal predictors into dummy variables; the outcome is excluded
  step_dummy(all_nominal(), -all_outcomes()) %>%
  # normalize the predictors, excluding nominal columns and any column
  # with the 'evaluative' role
  step_normalize(
    all_predictors(),
    -all_nominal(),
    -has_role(match = 'evaluative')
  )
# Estimate the recipe on the training data and print the processed training
# set to check that the steps behave as intended.
# bake(new_data = NULL) returns the same processed training data as the
# superseded juice().
lending_recipe %>%
  prep(training = lending_training) %>%
  bake(new_data = NULL)